R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# ---- Package setup ----
# NOTE: lines beginning with "##" throughout this file are console output
# captured when the document was knitted; they are not hand-written comments.
library(Metrics)
library(readr)
library(ggplot2)#for visualisation
library(corrplot)#for visualisation of correlation
## corrplot 0.92 loaded
library(mlbench) 
library(Amelia)
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2023 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(plotly)#converting ggplot to plotly
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(reshape2)
library(lattice)
library(caret)
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:Metrics':
## 
##     precision, recall
library(caTools)#for splitting data into testing and training data
library(dplyr) #manipulating dataframe
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# NOTE(review): mlbench was already attached above; this second call is
# redundant but harmless.
library(mlbench)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks plotly::filter(), stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the Amsterdam housing-prices dataset (August 2021): 924 rows, 8 columns.
# NOTE(review): absolute Windows path — not portable; consider a relative path
# (or here::here()) so the document knits on other machines.
data <- read_csv("C:/Users/FD_gi/Documents/Regression lineal/data/HousingPrices-Amsterdam-August-2021.csv")
## New names:
## Rows: 924 Columns: 8
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): Address, Zip dbl (6): ...1, Price, Area, Room, Lon, Lat
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Visualise the missingness pattern (Amelia::missmap; yellow = missing).
missmap(data,col=c('yellow','black'),y.at=1,y.labels='',legend=TRUE)
## Warning: Unknown or uninitialised column: `arguments`.
## Unknown or uninitialised column: `arguments`.

#Checking for NA and missing values and removing them.
# sum() over is.na() counts missing cells directly; this replaces the
# length(which(is.na(data) == T)) anti-pattern: T is reassignable (TRUE is
# not), and which() + length() is redundant around a logical matrix.
numberOfNA <- sum(is.na(data))
numberOfNA
## [1] 4
# Drop every row containing at least one NA (924 -> 920 rows).
data <- data %>%
        drop_na()


# Inspect structure after dropping NA rows: 920 obs. of 8 variables.
str(data) 
## tibble [920 × 8] (S3: tbl_df/tbl/data.frame)
##  $ ...1   : num [1:920] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Address: chr [1:920] "Blasiusstraat 8 2, Amsterdam" "Kromme Leimuidenstraat 13 H, Amsterdam" "Zaaiersweg 11 A, Amsterdam" "Tenerifestraat 40, Amsterdam" ...
##  $ Zip    : chr [1:920] "1091 CR" "1059 EL" "1097 SM" "1060 TH" ...
##  $ Price  : num [1:920] 685000 475000 850000 580000 720000 450000 450000 590000 399000 300000 ...
##  $ Area   : num [1:920] 64 60 109 128 138 53 87 80 49 33 ...
##  $ Room   : num [1:920] 3 3 4 6 5 2 3 2 3 2 ...
##  $ Lon    : num [1:920] 4.91 4.85 4.94 4.79 4.9 ...
##  $ Lat    : num [1:920] 52.4 52.3 52.3 52.3 52.4 ...
dim(data)
## [1] 920   8
# Removing rownumbers (the auto-named `...1` index column created by read_csv)
data$...1 <- NULL
dim(data)
## [1] 920   7
# Remove Zip and Address: free-text identifiers that cannot be used as
# numeric predictors in the regression models below.
data$Zip <- NULL
data$Address <- NULL

dim(data)
## [1] 920   5
# NOTE(review): corrplot is already attached at the top of the file; this
# second library() call is redundant but harmless.
library(corrplot)
str(data)
## tibble [920 × 5] (S3: tbl_df/tbl/data.frame)
##  $ Price: num [1:920] 685000 475000 850000 580000 720000 450000 450000 590000 399000 300000 ...
##  $ Area : num [1:920] 64 60 109 128 138 53 87 80 49 33 ...
##  $ Room : num [1:920] 3 3 4 6 5 2 3 2 3 2 ...
##  $ Lon  : num [1:920] 4.91 4.85 4.94 4.79 4.9 ...
##  $ Lat  : num [1:920] 52.4 52.3 52.3 52.3 52.4 ...
# Correlation matrix heatmap (all remaining columns are numeric).
corrplot(cor(data))

# Same matrix, with the coefficient values printed in each cell.
corrplot(cor(data),method='number')

# Highly correlated variables
correlated <- cor(data)
# caret::findCorrelation flags columns whose pairwise |correlation| exceeds
# the cutoff; the output below shows it flags column 2 ("Area").
highCorr <- findCorrelation(correlated, cutoff=0.70)
highCorr
## [1] 2
names(data[highCorr])
## [1] "Area"
# Five-number summaries (plus mean) for the response and each predictor.
summary(data)
##      Price              Area             Room             Lon       
##  Min.   : 175000   Min.   : 21.00   Min.   : 1.000   Min.   :4.645  
##  1st Qu.: 350000   1st Qu.: 60.00   1st Qu.: 3.000   1st Qu.:4.856  
##  Median : 467000   Median : 83.00   Median : 3.000   Median :4.887  
##  Mean   : 622065   Mean   : 95.61   Mean   : 3.564   Mean   :4.889  
##  3rd Qu.: 700000   3rd Qu.:113.00   3rd Qu.: 4.000   3rd Qu.:4.922  
##  Max.   :5950000   Max.   :623.00   Max.   :14.000   Max.   :5.029  
##       Lat       
##  Min.   :52.29  
##  1st Qu.:52.35  
##  Median :52.36  
##  Mean   :52.36  
##  3rd Qu.:52.38  
##  Max.   :52.42
# Print the cleaned tibble (first 10 of 920 rows shown).
data
## # A tibble: 920 × 5
##     Price  Area  Room   Lon   Lat
##     <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 685000    64     3  4.91  52.4
##  2 475000    60     3  4.85  52.3
##  3 850000   109     4  4.94  52.3
##  4 580000   128     6  4.79  52.3
##  5 720000   138     5  4.90  52.4
##  6 450000    53     2  4.88  52.4
##  7 450000    87     3  4.90  52.4
##  8 590000    80     2  4.87  52.4
##  9 399000    49     3  4.85  52.4
## 10 300000    33     2  4.90  52.4
## # ℹ 910 more rows
#Let’s split the loaded dataset into train and test sets. We will use 80% of the data to train our models and 20% will be used to test the models.

set.seed(123)
# Randomly assign each row to group 1 (train) or 2 (test) with 80/20
# probability. sample()-based splits are only approximately 80/20 — here
# they come out 737/183. TRUE is spelled out rather than the reassignable T.
ind <- sample(2, nrow(data), prob = c(0.8, 0.2), replace = TRUE)
train <- data[ind == 1, ]
test <- data[ind == 2, ]

dim(data)
## [1] 920   5
dim(train)
## [1] 737   5
dim(test)
## [1] 183   5
# Ordinary least-squares model: Price regressed on all remaining predictors
# (Area, Room, Lon, Lat).
lm_model <- lm(Price ~ .,
              data = train)
lm_model
## 
## Call:
## lm(formula = Price ~ ., data = train)
## 
## Coefficients:
## (Intercept)         Area         Room          Lon          Lat  
##   -39179798         9095       -61030      -444657       789084
# Coefficient table with standard errors, R-squared and F-statistic.
summary(lm_model)
## 
## Call:
## lm(formula = Price ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1665296  -133835     9973   112256  2368954 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.918e+07  2.111e+07  -1.856   0.0638 .  
## Area         9.095e+03  2.929e+02  31.049  < 2e-16 ***
## Room        -6.103e+04  1.053e+04  -5.795 1.02e-08 ***
## Lon         -4.447e+05  1.840e+05  -2.417   0.0159 *  
## Lat          7.891e+05  3.992e+05   1.977   0.0484 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 260000 on 732 degrees of freedom
## Multiple R-squared:  0.7456, Adjusted R-squared:  0.7443 
## F-statistic: 536.5 on 4 and 732 DF,  p-value: < 2.2e-16
#Predict
# Held-out evaluation: caret::postResample reports RMSE, R-squared and MAE.
pLm <- predict(lm_model,test)
postResample(pLm,test$Price)
##         RMSE     Rsquared          MAE 
## 3.961636e+05 6.054453e-01 2.017786e+05
#Cross validation
# NOTE(review): x and y below are never used afterwards — caret::train is
# called with the formula interface on `train` directly. Candidates for removal.
x <- data.matrix(train)
y <- train$Price


# 10-fold cross-validation on the training set, shared by the models below.
control <- trainControl(method = "cv",
                        number = 10)

# Same linear model as lm_model above, but resampled via 10-fold CV.
lineerCV <- train(Price~.,
                data = train,
                method = "lm",
                trControl = control )
lineerCV
## Linear Regression 
## 
## 737 samples
##   4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 664, 663, 663, 664, 663, 664, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   259936.1  0.7009155  172590.4
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
summary(lineerCV)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1665296  -133835     9973   112256  2368954 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.918e+07  2.111e+07  -1.856   0.0638 .  
## Area         9.095e+03  2.929e+02  31.049  < 2e-16 ***
## Room        -6.103e+04  1.053e+04  -5.795 1.02e-08 ***
## Lon         -4.447e+05  1.840e+05  -2.417   0.0159 *  
## Lat          7.891e+05  3.992e+05   1.977   0.0484 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 260000 on 732 degrees of freedom
## Multiple R-squared:  0.7456, Adjusted R-squared:  0.7443 
## F-statistic: 536.5 on 4 and 732 DF,  p-value: < 2.2e-16
#Predict
# Test-set metrics match the plain lm fit, as expected (same final model).
pLmCV <- predict(lineerCV,test)
postResample(pLmCV,test$Price)
##         RMSE     Rsquared          MAE 
## 3.961636e+05 6.054453e-01 2.017786e+05
# Actual vs. predicted prices for the plain linear model.
plLinearSimple <- test %>% 
  ggplot(aes(Price, pLm)) +
  geom_point(alpha = 0.5) + 
  # A fixed colour belongs outside aes(): aes(colour = 'black') maps the
  # literal string as data, producing a default-coloured smooth line plus a
  # spurious "black" legend entry rather than a black line.
  stat_smooth(colour = 'black') +
  xlab('Actual value of Price') +
  ylab('Predicted value of Price') +
  theme_bw()

ggplotly(plLinearSimple)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Ridge regression: glmnet with alpha = 0 (pure L2 penalty), tuning lambda
# over a 50-point grid with the same 10-fold CV control object.
ridge <- train(Price~.,
               data = train,
               method = "glmnet",
               tuneGrid = expand.grid(alpha = 0,
                                      lambda = seq(0.0001,1,length=50)),
               trControl = control )


# Held-out test-set metrics for the tuned ridge model.
pRidge <- predict(ridge,test)
postResample(pRidge,test$Price)
##         RMSE     Rsquared          MAE 
## 4.008932e+05 6.135220e-01 1.959027e+05
# Actual vs. predicted prices for the ridge model.
plRidge <- test %>% 
  ggplot(aes(Price, pRidge)) +
  geom_point(alpha = 0.5) + 
  # A fixed colour belongs outside aes(): aes(colour = 'black') maps the
  # literal string as data, producing a default-coloured smooth line plus a
  # spurious "black" legend entry rather than a black line.
  stat_smooth(colour = 'black') +
  xlab('Actual value of Price') +
  ylab('Predicted value of Price') +
  theme_bw()

ggplotly(plRidge)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Lasso regression: glmnet with alpha = 1 (pure L1 penalty), same lambda
# grid and CV control as the ridge fit above.
lasso <- train(Price~.,
               data = train,
               method = "glmnet",
               tuneGrid = expand.grid(alpha = 1,
                                      lambda = seq(0.0001,1,length=50)),
               trControl = control )


# Held-out test-set metrics for the tuned lasso model.
pLasso <- predict(lasso,test)
postResample(pLasso,test$Price)
##         RMSE     Rsquared          MAE 
## 3.959416e+05 6.064406e-01 2.013277e+05
# Actual vs. predicted prices for the lasso model.
# (Variable renamed from misspelt plLasoo; it is only used in this chunk.)
plLasso <- test %>% 
  ggplot(aes(Price, pLasso)) +
  geom_point(alpha = 0.5) + 
  # A fixed colour belongs outside aes(): aes(colour = 'black') maps the
  # literal string as data, producing a default-coloured smooth line plus a
  # spurious "black" legend entry rather than a black line.
  stat_smooth(colour = 'black') +
  xlab('Actual value of Price') +
  ylab('Predicted value of Price') +
  theme_bw()

ggplotly(plLasso)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
library(catboost)
#Separate x and y of the train and test datasets, which will be very useful when using the catboost package.
# NOTE(review): dplyr is already attached at the top of the file; this second
# library() call is redundant but harmless.
library(dplyr)
# unlist() turns the one-column tibble into a plain numeric label vector.
y_train <- unlist(train[c('Price')])
X_train <- train %>% select(-Price)
y_valid <- unlist(test[c('Price')])
X_valid <- test %>% select(-Price)

#Convert the train and test dataset to catboost specific format using the load_pool function by mentioning x and y of both train and test.
train_pool <- catboost.load_pool(data = X_train, label = y_train)
test_pool <- catboost.load_pool(data = X_valid, label = y_valid)


#Create an input params for the CatBoost regression.
params <- list(iterations=500,      # maximum number of boosting rounds
               learning_rate=0.01,
               depth=10,            # tree depth
               loss_function='RMSE',
               eval_metric='RMSE',
               random_seed = 55,
               od_type='Iter',      # overfitting detector mode
               metric_period = 50,  # log the metric every 50 iterations
               od_wait=20,          # stop after 20 rounds without improvement
               use_best_model=TRUE) # only effective when an eval set is given

# Train the CatBoost model. test_pool is supplied so the od_* /
# use_best_model settings in `params` actually take effect: without an eval
# set CatBoost silently disables use_best_model — see the captured warning
# below, which came from a run that omitted test_pool.
# NOTE(review): using the final test set for early stopping leaks information
# into model selection; a separate validation split would be cleaner.
modelCatboost <- catboost.train(learn_pool = train_pool,
                                test_pool = test_pool,
                                params = params)
## You should provide test set for use best model. use_best_model parameter has been switched to false value.
## 0:   learn: 511270.0935606   total: 151ms    remaining: 1m 15s
## 50:  learn: 391212.1848769   total: 678ms    remaining: 5.97s
## 100: learn: 310950.4012498   total: 1.24s    remaining: 4.91s
## 150: learn: 256155.2699471   total: 1.73s    remaining: 4s
## 200: learn: 217953.7878002   total: 2.27s    remaining: 3.38s
## 250: learn: 191625.5903299   total: 2.78s    remaining: 2.75s
## 300: learn: 173229.6150395   total: 3.22s    remaining: 2.13s
## 350: learn: 159696.8924010   total: 3.74s    remaining: 1.59s
## 400: learn: 148480.9119280   total: 4.26s    remaining: 1.05s
## 450: learn: 138989.5437633   total: 4.75s    remaining: 516ms
## 499: learn: 131528.7954719   total: 5.26s    remaining: 0us
# Predict on the held-out pool and compute test-set metrics.
y_pred=catboost.predict(modelCatboost,test_pool)

# CatBoost clearly outperforms the linear models on RMSE, R-squared and MAE.
catboostMetrics <- postResample(y_pred,test$Price)
catboostMetrics
##         RMSE     Rsquared          MAE 
## 3.239090e+05 7.841589e-01 1.144889e+05
# Actual vs. predicted prices for the CatBoost model.
plCatboost <- test %>% 
  ggplot(aes(Price, y_pred)) +
  geom_point(alpha = 0.5) + 
  # A fixed colour belongs outside aes(): aes(colour = 'black') maps the
  # literal string as data, producing a default-coloured smooth line plus a
  # spurious "black" legend entry rather than a black line.
  stat_smooth(colour = 'black') +
  xlab('Actual value of Price') +
  ylab('Predicted value of Price') +
  theme_bw()

ggplotly(plCatboost)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'